{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3.11.3\n"
     ]
    }
   ],
   "source": [
    "# Import required packages\n",
    "from urllib.parse import urljoin\n",
    "\n",
    "import ktool\n",
    "import lxml.etree as le\n",
    "import pandas as pd\n",
    "import requests"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Target page to scrape\n",
    "url = 'https://www.runoob.com/html/html-tutorial.html'\n",
    "# XPath expressions: link texts and link hrefs of the <a> children of the\n",
    "# div whose id is leftcolumn\n",
    "x = '//div[@id=\"leftcolumn\"]/a/text()'\n",
    "y = '//div[@id=\"leftcolumn\"]/a/@href'\n",
    "# Download the page; the timeout keeps a stalled connection from hanging\n",
    "# the kernel indefinitely\n",
    "response = requests.get(url, timeout=10)\n",
    "# Fail loudly on an HTTP error status instead of parsing an error page\n",
    "response.raise_for_status()\n",
    "# Raw page source as bytes\n",
    "content = response.content\n",
    "# Plain lxml: parse the HTML source into an element tree\n",
    "contentx = le.HTML(content)\n",
    "# Run the link-text XPath against the parsed tree\n",
    "retx = contentx.xpath(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "code_folding": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "HTML 教程\n",
      "HTML 简介\n",
      "HTML 编辑器\n",
      "HTML 基础\n",
      "HTML 元素\n",
      "HTML 属性\n",
      "HTML 标题\n",
      "HTML 段落\n",
      "HTML 文本格式化\n",
      "HTML 链接\n",
      "HTML 头部\n",
      "HTML CSS\n",
      "HTML 图像\n",
      "HTML 表格\n",
      "HTML 列表\n",
      "HTML 区块\n",
      "HTML 布局\n",
      "HTML 表单\n",
      "HTML 框架\n",
      "HTML 颜色\n",
      "HTML 颜色名\n",
      "HTML 颜色值\n",
      "HTML 脚本\n",
      "HTML 字符实体\n",
      "HTML URL\n",
      "HTML 速查列表\n",
      "HTML 标签简写及全称\n",
      "HTML 总结\n",
      "XHTML 简介\n",
      "HTML5 教程\n",
      "HTML5 浏览器支持\n",
      "HTML5 新元素\n",
      "HTML5 Canvas\n",
      "HTML5 内联 SVG\n",
      "HTML5 MathML\n",
      "HTML5 拖放\n",
      "HTML5 地理定位\n",
      "HTML5 Video(视频)\n",
      "HTML5 Audio(音频)\n",
      "HTML5 Input 类型\n",
      "HTML5 表单元素\n",
      "HTML5 表单属性\n",
      "HTML5 语义元素\n",
      "HTML5 Web 存储\n",
      "HTML5 Web SQL\n",
      "HTML5 应用程序缓存\n",
      "HTML5 Web Workers\n",
      "HTML5 SSE\n",
      "HTML5 WebSocket\n",
      "HTML5 测验\n",
      "HTML(5) 代码规范\n",
      "HTML 媒体(Media)\n",
      "HTML 插件\n",
      "HTML 音频(Audio)\n",
      "HTML 视频（Video）播放\n",
      "HTML 实例\n",
      "HTML 标签列表(字母排序)\n",
      "HTML 标签列表（功能排序）\n",
      "HTML 属性\n",
      "HTML 事件\n",
      "HTML 画布\n",
      "HTML 音频/视频\n",
      "HTML 有效DOCTYPES\n",
      "HTML 颜色名\n",
      "HTML 拾色器\n",
      "HTML 字符集\n",
      "HTML ASCII\n",
      "HTML ISO-8859-1\n",
      "HTML 符号\n",
      "HTML URL 编码\n",
      "HTML 语言代码\n",
      "HTTP 消息\n",
      "HTTP 方法\n",
      "键盘快捷键\n",
      "/html/html-tutorial.html\n",
      "/html/html-intro.html\n",
      "/html/html-editors.html\n",
      "/html/html-basic.html\n",
      "/html/html-elements.html\n",
      "/html/html-attributes.html\n",
      "/html/html-headings.html\n",
      "/html/html-paragraphs.html\n",
      "/html/html-formatting.html\n",
      "/html/html-links.html\n",
      "/html/html-head.html\n",
      "/html/html-css.html\n",
      "/html/html-images.html\n",
      "/html/html-tables.html\n",
      "/html/html-lists.html\n",
      "/html/html-blocks.html\n",
      "/html/html-layouts.html\n",
      "/html/html-forms.html\n",
      "/html/html-iframes.html\n",
      "/html/html-colors.html\n",
      "/html/html-colornames.html\n",
      "/html/html-colorvalues.html\n",
      "/html/html-scripts.html\n",
      "/html/html-entities.html\n",
      "/html/html-url.html\n",
      "/html/html-quicklist.html\n",
      "/html/html-tag-name.html\n",
      "/html/html-summary.html\n",
      "/html/html-xhtml.html\n",
      "/html/html5-intro.html\n",
      "html5-browsers.html\n",
      "/html/html5-new-element.html\n",
      "/html/html5-canvas.html\n",
      "/html/html5-svg.html\n",
      "html5-mathml.html\n",
      "/html/html5-draganddrop.html\n",
      "/html/html5-geolocation.html\n",
      "/html/html5-video.html\n",
      "/html/html5-audio.html\n",
      "/html/html5-form-input-types.html\n",
      "/html/html5-form-elements.html\n",
      "/html/html5-form-attributes.html\n",
      "/html/html5-semantic-elements.html\n",
      "/html/html5-webstorage.html\n",
      "html5-web-sql.html\n",
      "/html/html5-app-cache.html\n",
      "/html/html5-webworkers.html\n",
      "/html/html5-serversentevents.html\n",
      "/html/html5-websocket.html\n",
      "/quiz/html5-quiz.html\n",
      "/html/html5-syntax.html\n",
      "/html/html-media.html\n",
      "/html/html-object.html\n",
      "/html/html-sounds.html\n",
      "/html/html-videos.html\n",
      "/html/html-examples.html\n",
      "/tags/html-reference.html\n",
      "/tags/ref-byfunc.html\n",
      "/tags/ref-standardattributes.html\n",
      "/tags/ref-eventattributes.html\n",
      "/tags/ref-canvas.html\n",
      "/tags/ref-av-dom.html\n",
      "/tags/html-elementsdoctypes.html\n",
      "/tags/html-colorname.html\n",
      "/tags/html-colorpicker.html\n",
      "/charsets/html-charsets.html\n",
      "/tags/html-ascii.html\n",
      "/tags/ref-entities.html\n",
      "/tags/html-symbols.html\n",
      "/tags/html-urlencode.html\n",
      "/tags/html-language-codes.html\n",
      "/tags/html-httpmessages.html\n",
      "/tags/html-httpmethods.html\n",
      "/tags/html-keyboardshortcuts.html\n"
     ]
    }
   ],
   "source": [
    "# Extract the section titles with ktool's XPath helper\n",
    "retx = ktool.xpath.xpath_all(content, x)\n",
    "# Strip surrounding whitespace and drop entries that are empty afterwards\n",
    "ret_x = [title.strip() for title in retx if title.strip() != '']\n",
    "# The loops use their own variable names so that the XPath strings held in\n",
    "# x and y are not overwritten by the last printed item\n",
    "for title in ret_x:\n",
    "    print(title)\n",
    "# Extract and print the href that belongs to each section\n",
    "rety = ktool.xpath.xpath_all(content, y)\n",
    "for href in rety:\n",
    "    print(href)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['https://www.runoob.com/html/html-tutorial.html', 'https://www.runoob.com/html/html-intro.html', 'https://www.runoob.com/html/html-editors.html', 'https://www.runoob.com/html/html-basic.html', 'https://www.runoob.com/html/html-elements.html', 'https://www.runoob.com/html/html-attributes.html', 'https://www.runoob.com/html/html-headings.html', 'https://www.runoob.com/html/html-paragraphs.html', 'https://www.runoob.com/html/html-formatting.html', 'https://www.runoob.com/html/html-links.html', 'https://www.runoob.com/html/html-head.html', 'https://www.runoob.com/html/html-css.html', 'https://www.runoob.com/html/html-images.html', 'https://www.runoob.com/html/html-tables.html', 'https://www.runoob.com/html/html-lists.html', 'https://www.runoob.com/html/html-blocks.html', 'https://www.runoob.com/html/html-layouts.html', 'https://www.runoob.com/html/html-forms.html', 'https://www.runoob.com/html/html-iframes.html', 'https://www.runoob.com/html/html-colors.html', 'https://www.runoob.com/html/html-colornames.html', 'https://www.runoob.com/html/html-colorvalues.html', 'https://www.runoob.com/html/html-scripts.html', 'https://www.runoob.com/html/html-entities.html', 'https://www.runoob.com/html/html-url.html', 'https://www.runoob.com/html/html-quicklist.html', 'https://www.runoob.com/html/html-tag-name.html', 'https://www.runoob.com/html/html-summary.html', 'https://www.runoob.com/html/html-xhtml.html', 'https://www.runoob.com/html/html5-intro.html', 'https://www.runoob.comhtml5-browsers.html', 'https://www.runoob.com/html/html5-new-element.html', 'https://www.runoob.com/html/html5-canvas.html', 'https://www.runoob.com/html/html5-svg.html', 'https://www.runoob.comhtml5-mathml.html', 'https://www.runoob.com/html/html5-draganddrop.html', 'https://www.runoob.com/html/html5-geolocation.html', 'https://www.runoob.com/html/html5-video.html', 'https://www.runoob.com/html/html5-audio.html', 'https://www.runoob.com/html/html5-form-input-types.html', 
'https://www.runoob.com/html/html5-form-elements.html', 'https://www.runoob.com/html/html5-form-attributes.html', 'https://www.runoob.com/html/html5-semantic-elements.html', 'https://www.runoob.com/html/html5-webstorage.html', 'https://www.runoob.comhtml5-web-sql.html', 'https://www.runoob.com/html/html5-app-cache.html', 'https://www.runoob.com/html/html5-webworkers.html', 'https://www.runoob.com/html/html5-serversentevents.html', 'https://www.runoob.com/html/html5-websocket.html', 'https://www.runoob.com/quiz/html5-quiz.html', 'https://www.runoob.com/html/html5-syntax.html', 'https://www.runoob.com/html/html-media.html', 'https://www.runoob.com/html/html-object.html', 'https://www.runoob.com/html/html-sounds.html', 'https://www.runoob.com/html/html-videos.html', 'https://www.runoob.com/html/html-examples.html', 'https://www.runoob.com/tags/html-reference.html', 'https://www.runoob.com/tags/ref-byfunc.html', 'https://www.runoob.com/tags/ref-standardattributes.html', 'https://www.runoob.com/tags/ref-eventattributes.html', 'https://www.runoob.com/tags/ref-canvas.html', 'https://www.runoob.com/tags/ref-av-dom.html', 'https://www.runoob.com/tags/html-elementsdoctypes.html', 'https://www.runoob.com/tags/html-colorname.html', 'https://www.runoob.com/tags/html-colorpicker.html', 'https://www.runoob.com/charsets/html-charsets.html', 'https://www.runoob.com/tags/html-ascii.html', 'https://www.runoob.com/tags/ref-entities.html', 'https://www.runoob.com/tags/html-symbols.html', 'https://www.runoob.com/tags/html-urlencode.html', 'https://www.runoob.com/tags/html-language-codes.html', 'https://www.runoob.com/tags/html-httpmessages.html', 'https://www.runoob.com/tags/html-httpmethods.html', 'https://www.runoob.com/tags/html-keyboardshortcuts.html']\n"
     ]
    }
   ],
   "source": [
    "# Resolve every scraped href against the page URL. urljoin handles both\n",
    "# root-relative hrefs ('/html/html-intro.html') and bare relative ones\n",
    "# ('html5-browsers.html'); plain concatenation with the site root produced\n",
    "# broken links such as 'https://www.runoob.comhtml5-browsers.html'.\n",
    "y = [urljoin(url, href) for href in rety]\n",
    "print(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'x': <class 'str'>, 'y': <class 'str'>}, {'x': ['HTML 教程', 'HTML 简介', 'HTML 编辑器', 'HTML 基础', 'HTML 元素', 'HTML 属性', 'HTML 标题', 'HTML 段落', 'HTML 文本格式化', 'HTML 链接', 'HTML 头部', 'HTML CSS', 'HTML 图像', 'HTML 表格', 'HTML 列表', 'HTML 区块', 'HTML 布局', 'HTML 表单', 'HTML 框架', 'HTML 颜色', 'HTML 颜色名', 'HTML 颜色值', 'HTML 脚本', 'HTML 字符实体', 'HTML URL', 'HTML 速查列表', 'HTML 标签简写及全称', 'HTML 总结', 'XHTML 简介', 'HTML5 教程', 'HTML5 浏览器支持', 'HTML5 新元素', 'HTML5 Canvas', 'HTML5 内联 SVG', 'HTML5 MathML', 'HTML5 拖放', 'HTML5 地理定位', 'HTML5 Video(视频)', 'HTML5 Audio(音频)', 'HTML5 Input 类型', 'HTML5 表单元素', 'HTML5 表单属性', 'HTML5 语义元素', 'HTML5 Web 存储', 'HTML5 Web SQL', 'HTML5 应用程序缓存', 'HTML5 Web Workers', 'HTML5 SSE', 'HTML5 WebSocket', 'HTML5 测验', 'HTML(5) 代码规范', 'HTML 媒体(Media)', 'HTML 插件', 'HTML 音频(Audio)', 'HTML 视频（Video）播放', 'HTML 实例', 'HTML 标签列表(字母排序)', 'HTML 标签列表（功能排序）', 'HTML 属性', 'HTML 事件', 'HTML 画布', 'HTML 音频/视频', 'HTML 有效DOCTYPES', 'HTML 颜色名', 'HTML 拾色器', 'HTML 字符集', 'HTML ASCII', 'HTML ISO-8859-1', 'HTML 符号', 'HTML URL 编码', 'HTML 语言代码', 'HTTP 消息', 'HTTP 方法', '键盘快捷键'], 'y': ['https://www.runoob.com/html/html-tutorial.html', 'https://www.runoob.com/html/html-intro.html', 'https://www.runoob.com/html/html-editors.html', 'https://www.runoob.com/html/html-basic.html', 'https://www.runoob.com/html/html-elements.html', 'https://www.runoob.com/html/html-attributes.html', 'https://www.runoob.com/html/html-headings.html', 'https://www.runoob.com/html/html-paragraphs.html', 'https://www.runoob.com/html/html-formatting.html', 'https://www.runoob.com/html/html-links.html', 'https://www.runoob.com/html/html-head.html', 'https://www.runoob.com/html/html-css.html', 'https://www.runoob.com/html/html-images.html', 'https://www.runoob.com/html/html-tables.html', 'https://www.runoob.com/html/html-lists.html', 'https://www.runoob.com/html/html-blocks.html', 'https://www.runoob.com/html/html-layouts.html', 'https://www.runoob.com/html/html-forms.html', 'https://www.runoob.com/html/html-iframes.html', 
'https://www.runoob.com/html/html-colors.html', 'https://www.runoob.com/html/html-colornames.html', 'https://www.runoob.com/html/html-colorvalues.html', 'https://www.runoob.com/html/html-scripts.html', 'https://www.runoob.com/html/html-entities.html', 'https://www.runoob.com/html/html-url.html', 'https://www.runoob.com/html/html-quicklist.html', 'https://www.runoob.com/html/html-tag-name.html', 'https://www.runoob.com/html/html-summary.html', 'https://www.runoob.com/html/html-xhtml.html', 'https://www.runoob.com/html/html5-intro.html', 'https://www.runoob.comhtml5-browsers.html', 'https://www.runoob.com/html/html5-new-element.html', 'https://www.runoob.com/html/html5-canvas.html', 'https://www.runoob.com/html/html5-svg.html', 'https://www.runoob.comhtml5-mathml.html', 'https://www.runoob.com/html/html5-draganddrop.html', 'https://www.runoob.com/html/html5-geolocation.html', 'https://www.runoob.com/html/html5-video.html', 'https://www.runoob.com/html/html5-audio.html', 'https://www.runoob.com/html/html5-form-input-types.html', 'https://www.runoob.com/html/html5-form-elements.html', 'https://www.runoob.com/html/html5-form-attributes.html', 'https://www.runoob.com/html/html5-semantic-elements.html', 'https://www.runoob.com/html/html5-webstorage.html', 'https://www.runoob.comhtml5-web-sql.html', 'https://www.runoob.com/html/html5-app-cache.html', 'https://www.runoob.com/html/html5-webworkers.html', 'https://www.runoob.com/html/html5-serversentevents.html', 'https://www.runoob.com/html/html5-websocket.html', 'https://www.runoob.com/quiz/html5-quiz.html', 'https://www.runoob.com/html/html5-syntax.html', 'https://www.runoob.com/html/html-media.html', 'https://www.runoob.com/html/html-object.html', 'https://www.runoob.com/html/html-sounds.html', 'https://www.runoob.com/html/html-videos.html', 'https://www.runoob.com/html/html-examples.html', 'https://www.runoob.com/tags/html-reference.html', 'https://www.runoob.com/tags/ref-byfunc.html', 
'https://www.runoob.com/tags/ref-standardattributes.html', 'https://www.runoob.com/tags/ref-eventattributes.html', 'https://www.runoob.com/tags/ref-canvas.html', 'https://www.runoob.com/tags/ref-av-dom.html', 'https://www.runoob.com/tags/html-elementsdoctypes.html', 'https://www.runoob.com/tags/html-colorname.html', 'https://www.runoob.com/tags/html-colorpicker.html', 'https://www.runoob.com/charsets/html-charsets.html', 'https://www.runoob.com/tags/html-ascii.html', 'https://www.runoob.com/tags/ref-entities.html', 'https://www.runoob.com/tags/html-symbols.html', 'https://www.runoob.com/tags/html-urlencode.html', 'https://www.runoob.com/tags/html-language-codes.html', 'https://www.runoob.com/tags/html-httpmessages.html', 'https://www.runoob.com/tags/html-httpmethods.html', 'https://www.runoob.com/tags/html-keyboardshortcuts.html']}]\n"
     ]
    }
   ],
   "source": [
    "# First entry: a record that names each field and maps it to the str type\n",
    "schema = {'x': str, 'y': str}\n",
    "# Second entry: the scraped titles under 'x' and the links under 'y'\n",
    "record = {'x': ret_x, 'y': y}\n",
    "# Combine both entries into the list and show it\n",
    "datas = [schema, record]\n",
    "print(datas)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DataFrame已被成功写入\n"
     ]
    }
   ],
   "source": [
    "# Build a two-column table: section title and its link\n",
    "datas = pd.DataFrame({'name': ret_x, 'url': y})\n",
    "# Write the table to an Excel workbook. The raw string keeps the backslash\n",
    "# of the Windows path without relying on an invalid escape sequence, and the\n",
    "# with-block saves and closes the workbook on exit, even if to_excel raises\n",
    "# (ExcelWriter.save() is deprecated and not available in pandas 2.x).\n",
    "with pd.ExcelWriter(r'D:\\爬虫结果数据.xlsx') as writer:\n",
    "    datas.to_excel(writer)\n",
    "print('DataFrame已被成功写入')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
